Load required R packages and scripts.
# source("https://bioconductor.org/biocLite.R")
# biocLite("TCGAbiolinks")
library(TCGAbiolinks)
library(biomaRt)
library(SummarizedExperiment)
library(edgeR)
library(singscore)
library(plyr)
library(DT)
library(survival)
library(ggfortify)
library(RColorBrewer)
source("./script/Survival_analysis.R")
First, we read the expression data into R. We have already downloaded the FPKM RNASeq format of the metastatic SKCM samples from TCGA, and converted it to an R data object and saved it. We load the FPKM level data which is filtered for protein-coding genes. You can read any expression data at this step.
load("./output/TCGA_RNAseq_FPKM_proteinCoding.RData")
Now we would like to score samples against several signatures. Here we read in the signatures; these are csv or txt files that have at least a column of gene symbol and and a column of gene direction (if applicable). You can read in your own signatures here if you wish. Then we store gene names using different variable names.
##------- Read in the NK signature
nk_signature <- read.csv("./data/NK_genes_HGNC.csv", stringsAsFactors = F)
##--- store gene symbol in variable "nk" in R
nk <- as.character(nk_signature$Symbol)
##------- Read in a data frame that has information relate to both Epi and Mes signatures
emt_signature <- read.table("./data/Thiery_EMTsignature_both_tumour_cellLine_EntrezIDs.txt",
header = T, sep = "\t")
##--- Extract Epi signature
epi <- emt_signature$HGNC.symbol[emt_signature$epiMes_tumor == "epi"]
epi <- as.character(epi[complete.cases(epi)])
##--- Extract Mes signature
mes <- emt_signature$HGNC.symbol[emt_signature$epiMes_tumor == "mes"]
mes <- as.character(mes[complete.cases(mes)])
##-------- Read in the TGFb-EMT signature
tgfb_signature <- read.table("./data/Foroutan2016_TGFb_EMT_signature_upDown.txt",
header = T, sep = "\t")
##---- Store up- and down- gene sets separately
tgfb_up <- as.character(tgfb_signature$Symbol[tgfb_signature$upDown == "up"])
tgfb_dn <- as.character(tgfb_signature$Symbol[tgfb_signature$upDown == "down"])
The scoring method that we use is a rank-based method; therefore, we rank genes in each sample based on expression aboundance. We use the ranked data in the next step to score samples using singscore method.
exprData <- assay(rnaseq_fpkm)
tcgaRank <- rankGenes(exprData)
Score samples against four signatures: NK, Epi, MEs and TGFb-EMT:
nkScore_tcga <- simpleScore(
rankData = tcgaRank,
upSet = nk,
centerScore = T,
knownDirection = T
)
## Warning in singscoring(rankData, upSet = upSet, downSet = downSet,
## subSamples = subSamples, : 1 genes missing: KIR2DS4
epiScore_tcga <- simpleScore(
rankData = tcgaRank,
upSet = epi,
centerScore = T,
knownDirection = T
)
## Warning in singscoring(rankData, upSet = upSet, downSet = downSet,
## subSamples = subSamples, : 2 genes missing: C1orf106, OR7E14P
mesScore_tcga <- simpleScore(
rankData = tcgaRank,
upSet = mes,
centerScore = T,
knownDirection = T
)
## Warning in singscoring(rankData, upSet = upSet, downSet = downSet,
## subSamples = subSamples, : 4 genes missing: GUCY1B3, KIAA1462, LHFP, PTRF
tgfbScore_tcga <- simpleScore(
rankData = tcgaRank,
upSet = tgfb_up,
downSet = tgfb_dn,
centerScore = T,
knownDirection = T
)
## Warning in singscoring(rankData, upSet = upSet, downSet = downSet,
## subSamples = subSamples, : 12 genes missing: PFTK1, BHLHB2, PSCD1, C5orf13,
## KIAA0692, LOH3CR2A, C4orf18, CCDC99, FLJ10357, CXCR7, FLJ14213, C6orf145
## Warning in singscoring(rankData, downSet, NULL, subSamples, FALSE,
## dispersionFun): 9 genes missing: TACSTD1, SDPR, HRASLS3, KIAA0182,
## FLJ20273, RBM35A, GOLSYN, SQRDL, DEPDC6
epiScore_tcga$sampleID <-
mesScore_tcga$sampleID <-
tgfbScore_tcga$sampleID <-
nkScore_tcga$sampleID <-
rnaseq_fpkm$sample
Plot landscape of NK scores versus epithelial scores:
plotScoreLandscape(scoredf1 = epiScore_tcga,
scoredf2 = nkScore_tcga,
scorenames = c("Epithelial scores", "NK scores"),
textSize = 1,
isInteractive = T,
hexMin = 100)
Plot landscape of NK scores versus Mesenchymal scores:
plotScoreLandscape(scoredf1 = mesScore_tcga,
scoredf2 = nkScore_tcga,
scorenames = c("Mesenchymal scores", "NK scores"),
textSize = 1,
isInteractive = T,
hexMin = 100)
Plot landscape of NK scores versus TGFb-EMT scores:
plotScoreLandscape(scoredf1 = tgfbScore_tcga,
scoredf2 = nkScore_tcga,
scorenames = c("TGFb-EMT scores", "NK scores"),
textSize = 1,
isInteractive = T,
hexMin = 100)
There is a positive correlation between TGFb-EMT and mesenchymal scores:
plotScoreLandscape(scoredf1 = tgfbScore_tcga,
scoredf2 = mesScore_tcga,
scorenames = c("TGFb-EMT scores", "Mesenchymal scores"),
textSize = 1,
isInteractive = T,
hexMin = 100)
We would like to have only one data set for all the scores, so we need t merge them. Here we make the score data sets ready to merge. Basically, we only extract the sampleID and scores and re-name the score column to the signature name that we used to score samples.
nkScore_tcga <- nkScore_tcga[, c("sampleID", "TotalScore")]
colnames(nkScore_tcga)[2] <- "NK_scores"
epiScore_tcga <- epiScore_tcga[, c("sampleID", "TotalScore")]
colnames(epiScore_tcga)[2] <- "Epithelial_scores"
mesScore_tcga <- mesScore_tcga[, c("sampleID", "TotalScore")]
colnames(mesScore_tcga)[2] <- "Mesenchymal_scores"
tgfbScore_tcga <- tgfbScore_tcga[, c("sampleID", "TotalScore")]
colnames(tgfbScore_tcga)[2] <- "TGFbEMT_scores"
Now, we merge all the scores and export them.
multmerge = function(data){
Reduce(function(x,y) {merge(x, y, by = "sampleID")}, data)
}
allScores <- multmerge(list(nkScore_tcga,
epiScore_tcga,
mesScore_tcga,
tgfbScore_tcga))
# write.csv(allScores, "./output/allScores_MetMelanoma_TCGA_FPKM.csv",
# row.names = F)
Look at the merged data
DT::datatable(allScores, filter = "top")
# runApp("./shiny_app")
We can also look at the samples with highest or lowest scores for a given signature. For example, in the code below, we look at samples with the highest and lowest NK scores. You can change the “nkScore_tcga” with any other score data obtained from simpleScore() function, and accordingly, change the column name, e.g. “NK_scores”.
highScore <- row.names(nkScore_tcga)[nkScore_tcga$NK_scores == max (nkScore_tcga$NK_scores)]
lowScore <- row.names(nkScore_tcga)[nkScore_tcga$NK_scores == min (nkScore_tcga$NK_scores)]
plotRankDensity(rankData = tcgaRank[, highScore, drop = F],
upSet = nk,
isInteractive = T)
## Warning in plotRankDensity_intl(rankData, upSet = upSet, downSet =
## downSet, : 1 genes missing: KIR2DS4
plotRankDensity(rankData = tcgaRank[, lowScore, drop = F],
upSet = nk,
isInteractive = T)
## Warning in plotRankDensity_intl(rankData, upSet = upSet, downSet =
## downSet, : 1 genes missing: KIR2DS4
In this section, we would like to look at the associations between different variables and survival outcome. These variables can be from the below list that stratifies samples for survival analysis:
We can use the function survival_analysis() to plot Kaplan-Meier curves given any of the scenarios mentioned above. The inputs to this functions are:
First, we load this function, and prepare different data that we feed into the function:
dat <- rnaseq_fpkm
assay(dat) <- log2(assay(dat) + 1)
colnames(allScores)[1] <- "sample"
nk_tgfb_scores <- allScores[, c("sample", "NK_scores", "TGFbEMT_scores")]
Look at subsets of expression and score data:
datatable(assay(dat)[1:10,1:4], filter = "top")
datatable(nk_tgfb_scores, filter = "top")
First we reproduce the survival curves in figure 2 of the paper. These include the associations between age or age/expresssion of some genes with survival.
checkGenes <- c("IFNG", "KLRD1", "IL15", "B2M")
survival_analysis (
data = dat,
stratify = "covariate",
scores = NULL,
gene = NULL,
covariate = "age_at_diagnosis"
)
for(g in checkGenes){
print(survival_analysis (
data = dat,
stratify = "covariate_expr",
scores = NULL,
gene = g,
covariate = "age_at_diagnosis"
))
}
survival_analysis (
data = dat,
stratify = "score",
scores = nk_tgfb_scores,
gene = NULL,
covariate = NULL
)
# survival_analysis (
# data = dat,
# stratify = "expr",
# scores = NULL,
# gene = "XCL1",
# covariate = NULL
# )
checkGenes <- c("XCL1", "CCL5", "GZMB", "FAS", "CD96", "CD3E", "CD8B", "IL15", "IL15RA")
for(g in checkGenes){
print(survival_analysis (
data = dat,
stratify = "expr",
scores = NULL,
gene = g,
covariate = NULL
))
}
survival_analysis (
data = dat,
stratify = "expr_expr",
scores = NULL,
gene = c("IL15", "CISH"),
covariate = NULL
)
survival_analysis (
data = dat,
stratify = "score_expr",
scores = nk_tgfb_scores,
gene = "CISH",
covariate = NULL
)
## You can use the code below to save any single plot separately; simply run the code below after running the survival analysis for one scenario and make sure that each time you change the name of the exported file.
# ggsave(
# paste0("survival_analysis_IL15_CISH.png"),
# device = "png",
# path = "./figure/",
# width = 6,
# height = 4,
# units = "in"
# )
For this figure, we have partitioned samples into two groups: young and old, then stratify patients according to NK score and TGFb-EMT score.
medAge <- round(median(colData(dat)$age_at_diagnosis/365, na.rm = T), 1)
dat2 <- dat[, complete.cases(colData(dat)$age_at_diagnosis)]
dat_young <- dat2[, colData(dat2)$age_at_diagnosis/365 < medAge ]
dat_old <- dat2[, colData(dat2)$age_at_diagnosis/365 > medAge ]
survival_analysis (
data = dat_young,
stratify = "score_score",
scores = nk_tgfb_scores,
gene = NULL,
covariate = NULL
)
survival_analysis (
data = dat_old,
stratify = "score_score",
scores = nk_tgfb_scores,
gene = NULL,
covariate = NULL
)
```